library(tidyverse) #importing, tidying, plotting data
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.3     v dplyr   1.0.7
## v tidyr   1.1.3     v stringr 1.4.0
## v readr   2.0.1     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(knitr) #making tables
library(leaflet)
library(tinytex) #may need for knitting pdf versions of .rmd file
library(ggplot2)
library(data.table)
## 
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
## The following object is masked from 'package:purrr':
## 
##     transpose
library(ggfortify)

R Markdown

t.test This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

First, lets make a simple graph using some base R code.

#Body condition factor is BCF = ((body weight(g) - gut content(g)) / length(cm)^3) *100.

library(readr)
Site_BCF <- read_csv("data/Site_BCF.csv")
## Rows: 21 Columns: 3
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (1): Site
## dbl (2): Site_Location, Body_Condition_Factor
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
ggplot(Site_BCF, aes(x = Body_Condition_Factor)) +
    geom_histogram(binwidth = 1) +
    facet_wrap(~ Site, ncol = 1) +
    theme_bw()

library(readr)
Site_BodyConditionFactor <- read_csv("data/Site_BodyConditionFactor.csv")
## Rows: 21 Columns: 2
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## dbl (2): Site_Location, Body_Condition_Factor
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.

Now looking into Binary Data

ggplot(Site_BodyConditionFactor,aes(Body_Condition_Factor,Site_Location)) +
  geom_point() +
  geom_smooth() +
  xlab ("Body Condition Factor") +
  ylab ("Site") +
  labs(title="Raw Fit: 1=Wall Branch, 0=Dry Fork")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

Before we fit the GLM we can avoid inconveniently small coefficient values by rescaling distance in hundreds of meters.

Site_BodyConditionFactor$BCF100 <- Site_BodyConditionFactor$Body_Condition_Factor/100
fit.1 <- glm(Site_Location~BCF100, data=Site_BodyConditionFactor, binomial(link="logit"))
autoplot(fit.1)

library(arm)
## Loading required package: MASS
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## Loading required package: lme4
## 
## arm (Version 1.11-2, built: 2020-7-27)
## Working directory is C:/Users/ridew/OneDrive/Documents/Advanced Analytics/GLM_Assignment/GLMAssignment
x <- predict(fit.1)
y <- resid(fit.1)
binnedplot(x, y)

coef(fit.1)
## (Intercept)      BCF100 
##    1.966753 -257.308577
confint(fit.1)
## Waiting for profiling to be done...
##                    2.5 %   97.5 %
## (Intercept)    -7.210625  11.7298
## BCF100      -1470.488665 880.3547
ggplot(Site_BodyConditionFactor, aes(Body_Condition_Factor,Site_Location)) +
  geom_point() +
  geom_smooth(method="glm", method.args=list(family="binomial"(link="logit"))) + 
  xlab ("Body Condition Factor") +
  ylab ("Site") +
  labs (title="Raw Fit: 1=Wall Branch, 0=Dry Fork")
## `geom_smooth()` using formula 'y ~ x'

Figure 1. Length Measurements

Figure 2. Fathead Minnow Gonads (Testes)

library(readr)
GSI_RedColorationArea <- read_csv("data/GSI_RedColorationArea.csv")
## Rows: 21 Columns: 2
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## dbl (2): GSI_Value, Red_Coloration_Area
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
ggplot(data = GSI_RedColorationArea, aes(x=GSI_Value, y=Red_Coloration_Area)) +
geom_point()+
  xlab("GSI") +
    ylab("Red Coloration %") +
theme_bw()+
geom_smooth(method=lm)
## `geom_smooth()` using formula 'y ~ x'

ggsave("outputs/ScatterGSI_Area.png")
## Saving 7 x 5 in image
## `geom_smooth()` using formula 'y ~ x'
FitGSI_Area <- lm(data= GSI_RedColorationArea, Red_Coloration_Area~GSI_Value)
summary(FitGSI_Area)
## 
## Call:
## lm(formula = Red_Coloration_Area ~ GSI_Value, data = GSI_RedColorationArea)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -11.129  -4.093   0.079   4.519  13.015 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  14.0913     2.5593   5.506  2.6e-05 ***
## GSI_Value    -0.1511     0.3120  -0.484    0.634    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.55 on 19 degrees of freedom
## Multiple R-squared:  0.01219,    Adjusted R-squared:  -0.0398 
## F-statistic: 0.2345 on 1 and 19 DF,  p-value: 0.6337
autoplot(FitGSI_Area)

anova(FitGSI_Area)
## Analysis of Variance Table
## 
## Response: Red_Coloration_Area
##           Df  Sum Sq Mean Sq F value Pr(>F)
## GSI_Value  1   13.37  13.368  0.2345 0.6337
## Residuals 19 1083.04  57.002
boxcox(FitGSI_Area)

# #see linear regression for FitGSI_Area

#The R output for the boxcox() function plots the maximum likelihood surface (the curve) together with a maximum likelihood-based 95% CI (Hector, 2015)

#Helpful GLM component info from Hector, 2015 Ch8

#GLMs have three components: # 1.) a linear predictor- is what comes after the tilde (~) in our linear model formula # 2.) a variance function - models the variation in the data;make use of a much wider rangefamily of distributions including the poisson, the binomial, and the gamma. #3.) a link function- plays a role equivalent to the transformation in normal least squares models. However, rather than transforming the data we transform the predictions made by the linear predictor. Commonly used link functions include the log, square root, and logistic.

GLMGSI_Area <- glm(Red_Coloration_Area~GSI_Value, data= GSI_RedColorationArea, family= gaussian(link=)) 
ggplot(GSI_RedColorationArea,aes(GSI_Value,Red_Coloration_Area)) +
  geom_point() +
  geom_smooth(method="glm",colour="red",
                       method.args=list(family="gaussian"(link=)))+
  labs(title="GLM, Gaussian")
## `geom_smooth()` using formula 'y ~ x'

autoplot(GLMGSI_Area)

anova(GLMGSI_Area)
## Analysis of Deviance Table
## 
## Model: gaussian, link: identity
## 
## Response: Red_Coloration_Area
## 
## Terms added sequentially (first to last)
## 
## 
##           Df Deviance Resid. Df Resid. Dev
## NULL                         20     1096.4
## GSI_Value  1   13.368        19     1083.0
summary(GLMGSI_Area)
## 
## Call:
## glm(formula = Red_Coloration_Area ~ GSI_Value, family = gaussian(link = ), 
##     data = GSI_RedColorationArea)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -11.129   -4.093    0.079    4.519   13.015  
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  14.0913     2.5593   5.506  2.6e-05 ***
## GSI_Value    -0.1511     0.3120  -0.484    0.634    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 57.00198)
## 
##     Null deviance: 1096.4  on 20  degrees of freedom
## Residual deviance: 1083.0  on 19  degrees of freedom
## AIC: 148.4
## 
## Number of Fisher Scoring iterations: 2

#log data with Gaussian

gaussianlog <- glm(Red_Coloration_Area~GSI_Value, data= GSI_RedColorationArea, family= gaussian(link=log)) 
ggplot(GSI_RedColorationArea,aes(GSI_Value,Red_Coloration_Area)) +
  geom_point() +
  geom_smooth(method="glm",colour="red",
                       method.args=list(family="gaussian"(link=log)))+
  labs(title="GLM Log, Gaussian")
## `geom_smooth()` using formula 'y ~ x'

anova(gaussianlog)
## Analysis of Deviance Table
## 
## Model: gaussian, link: log
## 
## Response: Red_Coloration_Area
## 
## Terms added sequentially (first to last)
## 
## 
##           Df Deviance Resid. Df Resid. Dev
## NULL                         20     1096.4
## GSI_Value  1   12.771        19     1083.6
summary(gaussianlog)
## 
## Call:
## glm(formula = Red_Coloration_Area ~ GSI_Value, family = gaussian(link = log), 
##     data = GSI_RedColorationArea)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -11.1126   -4.0765    0.0993    4.5325   13.0334  
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  2.64598    0.19313  13.700 2.68e-11 ***
## GSI_Value   -0.01144    0.02585  -0.443    0.663    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 57.03351)
## 
##     Null deviance: 1096.4  on 20  degrees of freedom
## Residual deviance: 1083.6  on 19  degrees of freedom
## AIC: 148.41
## 
## Number of Fisher Scoring iterations: 5

Family: Gamma

gamma <- glm(Red_Coloration_Area~GSI_Value, data= GSI_RedColorationArea, family=Gamma(link= )) 
ggplot(GSI_RedColorationArea,aes(GSI_Value,Red_Coloration_Area)) +
  geom_point() +
  geom_smooth(method="glm",colour="red",
                       method.args=list(family="Gamma"(link=)))+
  labs(title="GLM, Gamma Variance")
## `geom_smooth()` using formula 'y ~ x'

anova(gamma)
## Analysis of Deviance Table
## 
## Model: Gamma, link: inverse
## 
## Response: Red_Coloration_Area
## 
## Terms added sequentially (first to last)
## 
## 
##           Df Deviance Resid. Df Resid. Dev
## NULL                         20     9.2373
## GSI_Value  1 0.081995        19     9.1553
summary(gamma)
## 
## Call:
## glm(formula = Red_Coloration_Area ~ GSI_Value, family = Gamma(link = ), 
##     data = GSI_RedColorationArea)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -1.32954  -0.33759   0.00693   0.29699   0.75846  
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 0.0704008  0.0145716   4.831 0.000116 ***
## GSI_Value   0.0009547  0.0019582   0.488 0.631449    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Gamma family taken to be 0.3254893)
## 
##     Null deviance: 9.2373  on 20  degrees of freedom
## Residual deviance: 9.1553  on 19  degrees of freedom
## AIC: 148.63
## 
## Number of Fisher Scoring iterations: 5
gamma_sqrt <- glm(Red_Coloration_Area~GSI_Value, data= GSI_RedColorationArea, family=Gamma(link= "sqrt"))
ggplot(GSI_RedColorationArea,aes(GSI_Value,Red_Coloration_Area)) +
  geom_point() +
  geom_smooth(method="glm",colour="red",
                       method.args=list(family="Gamma"(link="sqrt")))+
  labs(title="GLM, square-root link, Gamma variance")
## `geom_smooth()` using formula 'y ~ x'

summary(gamma_sqrt)
## 
## Call:
## glm(formula = Red_Coloration_Area ~ GSI_Value, family = Gamma(link = "sqrt"), 
##     data = GSI_RedColorationArea)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -1.33108  -0.33998   0.00384   0.29437   0.75474  
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.76884    0.35177   10.71 1.71e-09 ***
## GSI_Value   -0.02314    0.04063   -0.57    0.576    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Gamma family taken to be 0.3243847)
## 
##     Null deviance: 9.2373  on 20  degrees of freedom
## Residual deviance: 9.1481  on 19  degrees of freedom
## AIC: 148.61
## 
## Number of Fisher Scoring iterations: 6
coef(gamma_sqrt)
## (Intercept)   GSI_Value 
##  3.76884396 -0.02314351
confint(gamma_sqrt)
## Waiting for profiling to be done...
##                  2.5 %     97.5 %
## (Intercept)  3.1102085 4.54244977
## GSI_Value   -0.0978116 0.07400634
gammalog <- glm(Red_Coloration_Area~GSI_Value, data= GSI_RedColorationArea, family=Gamma(link= "log"))
ggplot(GSI_RedColorationArea,aes(GSI_Value,Red_Coloration_Area)) +
  geom_point() +
  geom_smooth(method="glm",colour="red",
                       method.args=list(family="Gamma"(link="log")))+
  labs(title="GLM, log link, Gamma variance")
## `geom_smooth()` using formula 'y ~ x'

anova(gammalog)
## Analysis of Deviance Table
## 
## Model: Gamma, link: log
## 
## Response: Red_Coloration_Area
## 
## Terms added sequentially (first to last)
## 
## 
##           Df Deviance Resid. Df Resid. Dev
## NULL                         20     9.2373
## GSI_Value  1  0.08659        19     9.1508
summary(gammalog)
## 
## Call:
## glm(formula = Red_Coloration_Area ~ GSI_Value, family = Gamma(link = "log"), 
##     data = GSI_RedColorationArea)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -1.33051  -0.33911   0.00496   0.29534   0.75609  
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  2.65341    0.19319  13.735 2.57e-11 ***
## GSI_Value   -0.01268    0.02355  -0.538    0.597    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Gamma family taken to be 0.324797)
## 
##     Null deviance: 9.2373  on 20  degrees of freedom
## Residual deviance: 9.1508  on 19  degrees of freedom
## AIC: 148.62
## 
## Number of Fisher Scoring iterations: 5
coef(gammalog)
## (Intercept)   GSI_Value 
##  2.65341089 -0.01268145
confint(gammalog)
## Waiting for profiling to be done...
##                  2.5 %     97.5 %
## (Intercept)  2.2800283 3.05155112
## GSI_Value   -0.0578871 0.03754406
leaflet() %>%
  setView(-86.854396, 36.26361 , zoom = 16) %>% #lat-long of the place of interest
  addTiles() %>%
  addProviderTiles('Esri.WorldImagery') %>%
  addMarkers(-86.854396, 36.26361 , popup = "Dry Fork, Whites Creek System")
leaflet() %>%
  setView(-87.287965, 36.499277 , zoom = 16) %>% #lat-long of the place of interest
  addTiles() %>%
  addProviderTiles('Esri.WorldImagery') %>%
  addMarkers(-87.287965, 36.499277 , popup = "Rotary Park, Wall Branch Creek System")

Figure 3. Southern Redbelly Dace in Full Breeding Coloration (Chrosomus erythrogaster)